setwd("~/Desktop/Datos")
datos<-read.table("SAheart.csv",sep = ";",dec='.',header=TRUE)
str(datos)
'data.frame': 462 obs. of 10 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity: num 23.1 28.6 32.3 38 27.8 ...
$ famhist : chr "Present" "Absent" "Present" "Present" ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ chd : chr "Si" "Si" "No" "Si" ...
summary(datos)
sbp tobacco ldl adiposity
Min. :101.0 Min. : 0.0000 Min. : 0.980 Min. : 6.74
1st Qu.:124.0 1st Qu.: 0.0525 1st Qu.: 3.283 1st Qu.:19.77
Median :134.0 Median : 2.0000 Median : 4.340 Median :26.11
Mean :138.3 Mean : 3.6356 Mean : 4.740 Mean :25.41
3rd Qu.:148.0 3rd Qu.: 5.5000 3rd Qu.: 5.790 3rd Qu.:31.23
Max. :218.0 Max. :31.2000 Max. :15.330 Max. :42.49
famhist typea obesity alcohol
Length:462 Min. :13.0 Min. :14.70 Min. : 0.00
Class :character 1st Qu.:47.0 1st Qu.:22.98 1st Qu.: 0.51
Mode :character Median :53.0 Median :25.80 Median : 7.51
Mean :53.1 Mean :26.04 Mean : 17.04
3rd Qu.:60.0 3rd Qu.:28.50 3rd Qu.: 23.89
Max. :78.0 Max. :46.58 Max. :147.19
age chd
Min. :15.00 Length:462
1st Qu.:31.00 Class :character
Median :45.00 Mode :character
Mean :42.82
3rd Qu.:55.00
Max. :64.00
datos
mean(datos$age)
[1] 42.81602
var(datos$age)
[1] 213.4216
sd(datos$age)
[1] 14.60896
cor(datos$age,datos$obesity)
[1] 0.2917771
# cor(datos) da un error. ¿Por qué? Porque famhist y chd son de tipo character
# (ver str(datos)); cor() solo acepta columnas numéricas, por eso abajo se eliminan.
datos2<-datos[,-c(5,10)]
head(datos2)
cor(datos2)
sbp tobacco ldl adiposity typea
sbp 1.00000000 0.21224652 0.15829633 0.35650008 -0.05745431
tobacco 0.21224652 1.00000000 0.15890546 0.28664037 -0.01460788
ldl 0.15829633 0.15890546 1.00000000 0.44043175 0.04404758
adiposity 0.35650008 0.28664037 0.44043175 1.00000000 -0.04314364
typea -0.05745431 -0.01460788 0.04404758 -0.04314364 1.00000000
obesity 0.23806661 0.12452941 0.33050586 0.71655625 0.07400610
alcohol 0.14009559 0.20081339 -0.03340340 0.10033013 0.03949794
age 0.38877060 0.45033016 0.31179923 0.62595442 -0.10260632
obesity alcohol age
sbp 0.23806661 0.14009559 0.3887706
tobacco 0.12452941 0.20081339 0.4503302
ldl 0.33050586 -0.03340340 0.3117992
adiposity 0.71655625 0.10033013 0.6259544
typea 0.07400610 0.03949794 -0.1026063
obesity 1.00000000 0.05161957 0.2917771
alcohol 0.05161957 1.00000000 0.1011246
age 0.29177713 0.10112465 1.0000000
library(ellipse)
Attaching package: 'ellipse'
The following object is masked from 'package:graphics':
pairs
matriz.correlacion<-cor(datos2)
plotcorr(matriz.correlacion,col = "red")
library(corrplot)
corrplot 0.92 loaded
corrplot(matriz.correlacion)
col <- colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(matriz.correlacion, method="shade", shade.col=NA, tl.col="black", tl.srt=45,
col=col(200), addCoef.col="black", order="AOE")
library("ggplot2")
library("GGally")
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
ggpairs(datos2)
setwd("~/Desktop/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
str(Datos)
'data.frame': 462 obs. of 10 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity: num 23.1 28.6 32.3 38 27.8 ...
$ famhist : chr "Present" "Absent" "Present" "Present" ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ chd : chr "Si" "Si" "No" "Si" ...
dim(Datos)
[1] 462 10
head(Datos)
famhist.Present <- as.numeric(Datos$famhist == "Present")
print(famhist.Present)
[1] 1 0 1 1 1 1 0 1 1 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 0 0
[38] 0 1 1 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1
[75] 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 0 1 0 1 0 0 0 0 1 0 0 0
[112] 1 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 0 0 0 1 1
[149] 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 0 0 0 0 1 1 1 1
[186] 1 0 0 1 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
[223] 0 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 1 0 0 0 1 1 1 1 1 0
[260] 1 0 1 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
[297] 0 0 0 1 1 0 0 1 0 1 1 0 0 1 1 0 1 0 0 0 1 0 1 0 1 1 0 1 0 0 1 0 0 0 0 1 1
[334] 0 0 1 1 0 1 0 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 1 0
[371] 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 1 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 1 0 1 1
[408] 1 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0
[445] 0 1 0 1 1 1 0 0 1 1 0 0 1 0 0 0 0 1
famhist.Absent <- as.numeric(Datos$famhist == "Absent")
print(famhist.Absent)
[1] 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1 0 1 1 0 1 1 1 0 1 1 1 1
[38] 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 1 0
[75] 1 1 1 0 1 0 0 1 0 1 0 1 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 1 0 1 1 1 1 0 1 1 1
[112] 0 1 0 1 0 1 1 0 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 1 1 1 1 0 0
[149] 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 0 0
[186] 0 1 1 0 0 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0
[223] 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 0 1 0 1 1 1 0 0 0 0 0 1
[260] 0 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1
[297] 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 1 0 1 1 1 0 1 0 1 0 0 1 0 1 1 0 1 1 1 1 0 0
[334] 1 1 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 1
[371] 0 0 1 0 0 1 1 0 1 1 1 0 0 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 0 1 1 1 0 0 1 0 0
[408] 0 1 1 1 1 1 0 0 0 1 0 0 1 1 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1
[445] 1 0 1 0 0 0 1 1 0 0 1 1 0 1 1 1 1 0
chd.Si <- as.numeric(Datos$chd == "Si")
print(chd.Si)
[1] 1 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 1 1 1 1 0 1 0
[38] 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
[75] 0 0 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0
[112] 1 0 1 1 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1
[149] 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 1
[186] 1 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1
[223] 0 1 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 1 0 1
[260] 0 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 1 0 0 0 1 0 1 0 1 0 0 0 0 1 0 0 0 1 0 1
[297] 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1
[334] 1 1 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0
[371] 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1
[408] 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
[445] 0 0 1 0 0 1 0 0 0 1 1 1 0 0 1 0 0 1
chd.No <- as.numeric(Datos$chd == "No")
print(chd.No)
[1] 0 0 1 0 0 1 1 0 1 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1 0 1
[38] 1 1 0 0 1 1 0 1 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1
[75] 1 1 1 0 0 1 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1
[112] 0 1 0 0 1 0 1 0 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 1 0 1 1 1 1 0 0 1 1 1 1 1 0
[149] 0 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0
[186] 0 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0
[223] 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 1 0 1 1 0 0 1 0 1 1 0 1 1 0 1 1 0 0 1 0
[260] 1 0 1 1 1 0 1 1 1 1 0 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0
[297] 1 1 0 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 1 1 0
[334] 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1
[371] 0 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 0 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0
[408] 0 1 1 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[445] 1 1 0 1 1 0 1 1 1 0 0 0 1 1 0 1 1 0
Datos2<-Datos[,-c(5,10)]
Datos2<-cbind(Datos2,famhist.Present)
Datos2<-cbind(Datos2,famhist.Absent)
Datos2<-cbind(Datos2,chd.Si)
Datos2<-cbind(Datos2,chd.No)
str(Datos2)
'data.frame': 462 obs. of 12 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity : num 23.1 28.6 32.3 38 27.8 ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ famhist.Present: num 1 0 1 1 1 1 0 1 1 1 ...
$ famhist.Absent : num 0 1 0 0 0 0 1 0 0 0 ...
$ chd.Si : num 1 1 0 1 1 0 0 1 0 1 ...
$ chd.No : num 0 0 1 0 0 1 1 0 1 0 ...
dim(Datos2)
[1] 462 12
head(Datos2)
setwd("~/Desktop/Datos")
Datos <- read.csv("SAheart.csv",header=TRUE, sep=";", dec=".")
library(dummies)
dummies-1.5.6 provided by Decision Patterns
Datos2 <- dummy.data.frame(Datos, sep = ".")
str(Datos2)
'data.frame': 462 obs. of 12 variables:
$ sbp : int 160 144 118 170 134 132 142 114 114 132 ...
$ tobacco : num 12 0.01 0.08 7.5 13.6 6.2 4.05 4.08 0 0 ...
$ ldl : num 5.73 4.41 3.48 6.41 3.5 6.47 3.38 4.59 3.83 5.8 ...
$ adiposity : num 23.1 28.6 32.3 38 27.8 ...
$ famhist.Absent : int 0 1 0 0 0 0 1 0 0 0 ...
$ famhist.Present: int 1 0 1 1 1 1 0 1 1 1 ...
$ typea : int 49 55 52 51 60 62 59 62 49 69 ...
$ obesity : num 25.3 28.9 29.1 32 26 ...
$ alcohol : num 97.2 2.06 3.81 24.26 57.34 ...
$ age : int 52 63 46 58 49 45 38 58 29 53 ...
$ chd.No : int 0 0 1 0 0 1 1 0 1 0 ...
$ chd.Si : int 1 1 0 1 1 0 0 1 0 1 ...
- attr(*, "dummies")=List of 2
..$ famhist: int [1:2] 5 6
..$ chd : int [1:2] 11 12
setwd("~/Desktop/Datos")
datos=read.csv("EjemploClientes.csv",sep = ";",dec=',',header=TRUE,row.names=1)
suppressMessages(suppressWarnings(library(FactoMineR)))
suppressMessages(suppressWarnings(library(car)))
Atipicos<-(Boxplot(~Edad, data=datos, id.method="y",col="Blue"))
Atipicos
[1] " Damaris "
Atipicos<-Boxplot(~Antiguedad, data=datos, id.method="y",col="Red")
Atipicos
[1] " Arnoldo "
Atipicos<-Boxplot(~Espacios.Parqueo, data=datos, id.method="y",col="Red")
Atipicos
[1] " Griselda " " Diana "
setwd("~/Desktop/Datos")
Datos <- read.csv("BostonCasas.csv",row.names=1,header=TRUE, sep=",", dec=".")
dim(Datos)
[1] 506 14
str(Datos)
'data.frame': 506 obs. of 14 variables:
$ Crimin : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ ResidM25 : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ NegocMin : chr "Baja" "Baja" "Baja" "Baja" ...
$ LimitaRC : chr "No" "No" "No" "No" ...
$ OxNitr : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ Habitac : num 6.58 6.42 7.18 7 7.15 ...
$ VivA1940 : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ Distanc : num 4.09 4.97 4.97 6.06 6.06 ...
$ AccAutop : int 1 2 2 3 3 3 5 5 5 5 ...
$ Impuesto : int 296 242 242 222 222 222 311 311 311 311 ...
$ AlumProf : num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ Afroame : num 397 397 393 395 397 ...
$ ClaseBaja: num 4.98 9.14 4.03 2.94 5.33 ...
$ ValorProm: num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Datos
# Elimino variables categóricas
Datos2 <- Datos[,-c(3,4)]
dim(Datos2)
[1] 506 12
str(Datos)
'data.frame': 506 obs. of 14 variables:
$ Crimin : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
$ ResidM25 : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
$ NegocMin : chr "Baja" "Baja" "Baja" "Baja" ...
$ LimitaRC : chr "No" "No" "No" "No" ...
$ OxNitr : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
$ Habitac : num 6.58 6.42 7.18 7 7.15 ...
$ VivA1940 : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
$ Distanc : num 4.09 4.97 4.97 6.06 6.06 ...
$ AccAutop : int 1 2 2 3 3 3 5 5 5 5 ...
$ Impuesto : int 296 242 242 222 222 222 311 311 311 311 ...
$ AlumProf : num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
$ Afroame : num 397 397 393 395 397 ...
$ ClaseBaja: num 4.98 9.14 4.03 2.94 5.33 ...
$ ValorProm: num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
Datos2
library(corrplot)
matriz.correlacion<-cor(Datos2)
corrplot(matriz.correlacion)
# Funciones para el gráfico de pares
# Lower-panel function for GGally::ggpairs(): draws a scatterplot of the
# mapped pair of variables and overlays a red least-squares line
# (no confidence band).
lower <- function(data, mapping) {
  base <- ggplot(data = data, mapping = mapping)
  base +
    geom_point(size = 1, col = "dodgerblue3") +
    geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
}
# Diagonal-panel function for GGally::ggpairs(): histogram scaled to a
# density (so the kernel density curve overlays it on the same y scale).
# NOTE(review): this definition masks base::diag() in the session; the name
# is kept because the ggpairs() calls below pass `diag = list(continuous = diag)`.
diag <- function(data, mapping){
  p <- ggplot(data = data, mapping = mapping) +
    # after_stat(density) replaces the `..density..` notation,
    # which is deprecated since ggplot2 3.4.0
    geom_histogram(bins = 30, col = "black", fill = "#F8766D",
                   aes(y = after_stat(density))) +
    geom_density()
  return(p)
}
library(tidyverse) # Para usar la función select_if
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.0 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ lubridate 1.9.2 ✔ tibble 3.1.8
✔ purrr 1.0.1 ✔ tidyr 1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::group_rows() masks kableExtra::group_rows()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::recode() masks car::recode()
✖ purrr::some() masks car::some()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
ggpairs(select_if(Datos2, is.numeric) %>% dplyr::select(-ValorProm, everything()), # el select posiciona la variable a predecir al final, para que se grafique bien en caso de que solo haya 1 variable a predecir
lower = list(continuous = lower), diag = list(continuous = diag))
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
corona <- c(1,5,9,9,13,22,23,26,27,35,41,50,69,87,113,117,134,158,177,201,231)
corona
[1] 1 5 9 9 13 22 23 26 27 35 41 50 69 87 113 117 134 158 177
[20] 201 231
barplot(corona,col = "green")
casos.nuevos.dia<-diff(corona)
casos.nuevos.dia
[1] 4 4 0 4 9 1 3 1 8 6 9 19 18 26 4 17 24 19 24 30
length(casos.nuevos.dia)
[1] 20
barplot(casos.nuevos.dia,col = "blue")
eje.x<-seq(1:(length(corona)-1))
eje.x
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
datos<-rbind(eje.x,casos.nuevos.dia)
datos
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
eje.x 1 2 3 4 5 6 7 8 9 10 11 12
casos.nuevos.dia 4 4 0 4 9 1 3 1 8 6 9 19
[,13] [,14] [,15] [,16] [,17] [,18] [,19] [,20]
eje.x 13 14 15 16 17 18 19 20
casos.nuevos.dia 18 26 4 17 24 19 24 30
datos<-t(datos)
datos<-as.data.frame(datos)
datos
modelo.lm <- lm(casos.nuevos.dia~eje.x,data=datos)
modelo.lm
Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)
Coefficients:
(Intercept) eje.x
-2.521 1.335
summary(modelo.lm)
Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)
Residuals:
Min 1Q Median 3Q Max
-13.5090 -3.3323 -0.1677 4.0989 9.8263
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.5211 2.6123 -0.965 0.347
eje.x 1.3353 0.2181 6.123 8.76e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.623 on 18 degrees of freedom
Multiple R-squared: 0.6757, Adjusted R-squared: 0.6576
F-statistic: 37.5 on 1 and 18 DF, p-value: 8.76e-06
plot(datos$eje.x,datos$casos.nuevos.dia,col="red")
abline(modelo.lm,col="blue")
pred<-predict(modelo.lm,data.frame(eje.x=c(21,22,23)),interval="confidence")
pred
fit lwr upr
1 25.52105 20.03287 31.00924
2 26.85639 20.96250 32.75028
3 28.19173 21.88495 34.49850
# El 4 es un dato atípico
casos.nuevos.dia
[1] 4 4 0 4 9 1 3 1 8 6 9 19 18 26 4 17 24 19 24 30
casos.nuevos.dia[15]<-mean(casos.nuevos.dia)
casos.nuevos.dia
[1] 4.0 4.0 0.0 4.0 9.0 1.0 3.0 1.0 8.0 6.0 9.0 19.0 18.0 26.0 11.5
[16] 17.0 24.0 19.0 24.0 30.0
datos<-rbind(eje.x,casos.nuevos.dia)
datos<-t(datos)
datos<-as.data.frame(datos)
modelo.lm <- lm(casos.nuevos.dia~eje.x,data=datos)
modelo.lm
Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)
Coefficients:
(Intercept) eje.x
-2.679 1.386
summary(modelo.lm)
Call:
lm(formula = casos.nuevos.dia ~ eje.x, data = datos)
Residuals:
Min 1Q Median 3Q Max
-7.410 -3.682 -0.568 4.117 9.274
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.6789 2.2394 -1.196 0.247
eje.x 1.3861 0.1869 7.415 7.11e-07 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.821 on 18 degrees of freedom
Multiple R-squared: 0.7533, Adjusted R-squared: 0.7396
F-statistic: 54.98 on 1 and 18 DF, p-value: 7.107e-07
plot(datos$eje.x,datos$casos.nuevos.dia,col="red")
abline(modelo.lm,col="blue")
pred<-predict(modelo.lm,data.frame(eje.x=c(21,22,23)),interval="confidence")
pred
fit lwr upr
1 26.42895 21.72419 31.13371
2 27.81504 22.76249 32.86758
3 29.20113 23.79463 34.60763
modelo.lm <- lm(ValorProm~ClaseBaja, data = Datos2)
modelo.lm
Call:
lm(formula = ValorProm ~ ClaseBaja, data = Datos2)
Coefficients:
(Intercept) ClaseBaja
34.55 -0.95
summary(modelo.lm)
Call:
lm(formula = ValorProm ~ ClaseBaja, data = Datos2)
Residuals:
Min 1Q Median 3Q Max
-15.168 -3.990 -1.318 2.034 24.500
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 34.55384 0.56263 61.41 <2e-16 ***
ClaseBaja -0.95005 0.03873 -24.53 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
plot(Datos2$ClaseBaja,Datos2$ValorProm,col="red")
abline(modelo.lm,col="blue")
pred<-predict(modelo.lm,data.frame(ClaseBaja=c(8,10,15)),interval="confidence")
pred
fit lwr upr
1 26.95345 26.30529 27.60161
2 25.05335 24.47413 25.63256
3 20.30310 19.73159 20.87461
# Acepta variables categóricas sin problema
modelo.lm <- lm(ValorProm~., data = Datos2)
modelo.lm
Call:
lm(formula = ValorProm ~ ., data = Datos2)
Coefficients:
(Intercept) Crimin ResidM25 OxNitr Habitac VivA1940
36.711949 -0.114059 0.046099 -16.690209 3.825925 0.002839
Distanc AccAutop Impuesto AlumProf Afroame ClaseBaja
-1.513485 0.316561 -0.012696 -0.980534 0.009692 -0.531803
summary(modelo.lm)
Call:
lm(formula = ValorProm ~ ., data = Datos2)
Residuals:
Min 1Q Median 3Q Max
-13.3346 -2.7771 -0.5547 1.8853 26.3736
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 36.711949 5.136151 7.148 3.19e-12 ***
Crimin -0.114059 0.033064 -3.450 0.000609 ***
ResidM25 0.046099 0.013762 3.350 0.000871 ***
OxNitr -16.690209 3.707108 -4.502 8.40e-06 ***
Habitac 3.825925 0.419477 9.121 < 2e-16 ***
VivA1940 0.002839 0.013301 0.213 0.831059
Distanc -1.513485 0.196417 -7.705 7.20e-14 ***
AccAutop 0.316561 0.064028 4.944 1.05e-06 ***
Impuesto -0.012696 0.003396 -3.738 0.000207 ***
AlumProf -0.980534 0.130351 -7.522 2.56e-13 ***
Afroame 0.009692 0.002703 3.585 0.000371 ***
ClaseBaja -0.531803 0.050915 -10.445 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 4.784 on 494 degrees of freedom
Multiple R-squared: 0.7353, Adjusted R-squared: 0.7294
F-statistic: 124.7 on 11 and 494 DF, p-value: < 2.2e-16
\(\huge RSS = \sum_{i=1}^{n}{(y_i-\hat{y}_i)^2}\)
# Residual Sum of Square (RSS)
# Residual Sum of Squares: total squared deviation between the
# observed values (Real) and the model predictions (Pred).
RSS <- function(Pred, Real) {
  residuos <- Real - Pred
  sum(residuos^2)
}
# NumPred es el número total de predictores; se resta NumPred y además 1 (el intercepto) para obtener los grados de libertad
# Residual Standard Error: square root of the RSS divided by the
# residual degrees of freedom, i.e. n - (NumPred predictors + 1 intercept).
RSE <- function(Pred, Real, NumPred) {
  grados.libertad <- length(Real) - NumPred - 1
  sqrt((1 / grados.libertad) * sum((Real - Pred)^2))
}
\(\huge MSE = \frac{1}{n}\sum_{i=1}^n{(y_i-\hat{y}_i)^2}\)
# Mean Squared Error: average of the squared residuals over the
# n observations in Real.
MSE <- function(Pred, Real) {
  n <- length(Real)
  (1 / n) * sum((Real - Pred)^2)
}
\(\huge error.relativo = \frac{\sum_{i=1}^n{|y_i-\hat{y}_i|}}{\sum_{i = 1}^n|y_i|}\)
# Relative error: sum of absolute residuals divided by the sum of the
# absolute observed values (scale-free measure of fit).
error.relativo <- function(Pred, Real) {
  desvio.abs <- abs(Real - Pred)
  sum(desvio.abs) / sum(abs(Real))
}
# Funciones para desplegar precisión
# Collects the four precision indices for a regression model in one list:
# MSE, RSE (uses the number of predictors for the degrees of freedom),
# relative error, and the real-vs-prediction correlation.
indices.precision <- function(real, prediccion, cantidad.variables.predictoras) {
  list(
    error.cuadratico      = MSE(prediccion, real),
    raiz.error.cuadratico = RSE(prediccion, real, cantidad.variables.predictoras),
    error.relativo        = error.relativo(prediccion, real),
    correlacion           = as.numeric(cor(prediccion, real))
  )
}
# Gráfico de dispersión entre el valor real de la variable a predecir y la predicción del modelo.
# Scatterplot of the observed values of the response variable against the
# model's predictions. `modelo` (optional) is appended to the plot title.
plot.real.prediccion <- function(real, prediccion, modelo = "") {
  datos.grafico <- data.frame(Real = real, Prediccion = as.numeric(prediccion))
  sufijo <- ifelse(modelo == "", "", paste(", con", modelo))
  ggplot(data = datos.grafico, mapping = aes(x = Real, y = Prediccion)) +
    geom_point(size = 1, col = "dodgerblue3") +
    labs(title = paste0("Real vs Predicción", sufijo),
         x = "Real",
         y = "Predicción")
}
setwd("~/Desktop/Datos")
Datos <- read.csv("BostonCasas.csv",row.names=1,header=TRUE, sep=",", dec=".")
# Elimino variables categóricas
Datos2 <- Datos[,-c(3,4)]
# División Training y Testing, 70% para entrenamiento y 30% para testing
numero.predictoras <- dim(Datos2)[2] - 1
numero.filas <- dim(Datos2)[1]
muestra <- sample(1:numero.filas,numero.filas*0.3)
datos.aprendizaje <- Datos2[-muestra, ]
datos.prueba <- Datos2[muestra, ]
datos.aprendizaje
datos.prueba
# Calcula el modelo usando solo los datos de training
modelo.lm <- lm(ValorProm~., data = datos.aprendizaje)
modelo.lm
Call:
lm(formula = ValorProm ~ ., data = datos.aprendizaje)
Coefficients:
(Intercept) Crimin ResidM25 OxNitr Habitac VivA1940
29.127664 -0.115590 0.031764 -12.277622 4.443833 -0.006966
Distanc AccAutop Impuesto AlumProf Afroame ClaseBaja
-1.288157 0.314014 -0.011023 -1.006278 0.012309 -0.518149
# Hace la Predicción
prediccion <- predict(modelo.lm, datos.prueba)
# Medición de precisión
pre.lm <- indices.precision(datos.prueba$ValorProm, prediccion,numero.predictoras)
pre.lm
$error.cuadratico
[1] 25.65676
$raiz.error.cuadratico
[1] 5.279369
$error.relativo
[1] 0.163729
$correlacion
[1] 0.8457587
# Gráfico real vs predicción, con curva de mejor ajuste lineal
g <- plot.real.prediccion(datos.prueba$ValorProm, prediccion, modelo = "Regresión Lineal")
g + geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
`geom_smooth()` using formula = 'y ~ x'
library(dummies)
setwd("~/Desktop/Datos")
Datos <- read.csv("BostonCasas.csv",row.names=1,header=TRUE, sep=",", dec=".")
# Convierto las variables categóricas en Dummies
Datos2 <- dummy.data.frame(Datos, sep = ".")
# División Training y Testing, 70% para entrenamiento y 30% para testing
numero.predictoras <- dim(Datos2)[2] - 1
numero.filas <- dim(Datos2)[1]
muestra <- sample(1:numero.filas,numero.filas*0.3)
datos.aprendizaje <- Datos2[-muestra, ]
datos.prueba <- Datos2[muestra, ]
datos.aprendizaje
datos.prueba
# Calcula el modelo usando solo los datos de training
modelo.lm <- lm(ValorProm~., data = datos.aprendizaje)
modelo.lm
Call:
lm(formula = ValorProm ~ ., data = datos.aprendizaje)
Coefficients:
(Intercept) Crimin ResidM25 NegocMin.Alta NegocMin.Baja
32.31317 -0.15832 0.04641 3.08147 1.66204
NegocMin.Media LimitaRC.No LimitaRC.Si OxNitr Habitac
NA -2.75387 NA -18.55975 4.24971
VivA1940 Distanc AccAutop Impuesto AlumProf
-0.01286 -1.51565 0.43668 -0.01224 -0.82180
Afroame ClaseBaja
0.01067 -0.48175
# Hace la Predicción
prediccion <- predict(modelo.lm, datos.prueba)
# Medición de precisión
pre.lm <- indices.precision(datos.prueba$ValorProm, prediccion,numero.predictoras)
pre.lm
$error.cuadratico
[1] 31.45123
$raiz.error.cuadratico
[1] 5.953261
$error.relativo
[1] 0.1674544
$correlacion
[1] 0.8308064
# Gráfico real vs predicción, con curva de mejor ajuste lineal
g <- plot.real.prediccion(datos.prueba$ValorProm, prediccion, modelo = "Regresión Lineal")
g + geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
`geom_smooth()` using formula = 'y ~ x'
Estos datos provienen de un estudio que examinó la correlación entre el nivel de antígeno prostático específico y una serie de medidas clínicas en hombres que estaban a punto de recibir una prostatectomía radical. Es un data.frame con 97 filas y 9 columnas.
Este contiene las siguientes variables:
setwd("~/Desktop/Datos")
datos.prostata <- read.csv("prostate_data.csv", header=TRUE, dec=",", sep=";")
head(datos.prostata)
correlaciones <- cor(datos.prostata)
correlaciones
lcavol lweight age lbph svi lcp
lcavol 1.0000000 0.28052138 0.2249999 0.027349703 0.53884500 0.675310484
lweight 0.2805214 1.00000000 0.3479691 0.442264399 0.15538490 0.164537142
age 0.2249999 0.34796911 1.0000000 0.350185896 0.11765804 0.127667752
lbph 0.0273497 0.44226440 0.3501859 1.000000000 -0.08584324 -0.006999431
svi 0.5388450 0.15538490 0.1176580 -0.085843238 1.00000000 0.673111185
lcp 0.6753105 0.16453714 0.1276678 -0.006999431 0.67311118 1.000000000
gleason 0.4324171 0.05688209 0.2688916 0.077820447 0.32041222 0.514830063
pgg45 0.4336522 0.10735379 0.2761124 0.078460018 0.45764762 0.631528246
lpsa 0.7344603 0.43331938 0.1695928 0.179809404 0.56621822 0.548813175
gleason pgg45 lpsa
lcavol 0.43241706 0.43365225 0.7344603
lweight 0.05688209 0.10735379 0.4333194
age 0.26889160 0.27611245 0.1695928
lbph 0.07782045 0.07846002 0.1798094
svi 0.32041222 0.45764762 0.5662182
lcp 0.51483006 0.63152825 0.5488132
gleason 1.00000000 0.75190451 0.3689868
pgg45 0.75190451 1.00000000 0.4223159
lpsa 0.36898681 0.42231586 1.0000000
corrplot(correlaciones)
ggpairs(select_if(datos.prostata, is.numeric) %>% dplyr::select(-lpsa, everything()), # el select posiciona la variable a predecir al final, para que se grafique bien en caso de que solo haya 1 variable a predecir
lower = list(continuous = lower), diag = list(continuous = diag))
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
lm recibe una fórmula; en R se puede definir una
fórmula de la siguiente manera: y ~ x, donde y sería lpsa y
x lcavol.
modelo <- lm(lpsa ~ lcavol, data = datos.prostata)
modelo
Call:
lm(formula = lpsa ~ lcavol, data = datos.prostata)
Coefficients:
(Intercept) lcavol
1.5073 0.7193
coef(modelo)
(Intercept) lcavol
1.5072975 0.7193204
x <- datos.prostata$lcavol
b0 <- coef(modelo)[1]
b0
(Intercept)
1.507297
b1 <- coef(modelo)[2]
b1
lcavol
0.7193204
b0 + b1 * x
[1] 1.0902222 0.7921115 1.1398502 0.6412553 2.0478064 0.7521390 2.0375546
[8] 2.0058924 0.9487245 1.6678092 1.6904668 0.5383199 2.6678705 2.5697687
[15] 2.3747769 2.6158846 1.2084087 3.1534522 1.1029539 1.6384451 2.3326474
[22] 2.9885499 1.1154641 2.7889172 1.7844246 2.5480958 1.8761819 1.2192258
[29] 2.2555897 3.2406036 1.7124325 1.6384451 2.4246919 1.5144549 1.5000680
[36] 2.4484079 2.5309683 1.8363325 3.4213792 2.0809606 1.9536908 2.5447028
[43] 1.9260970 2.7816144 2.5763080 2.7041934 3.4694976 2.3439756 2.7630262
[50] 2.3854653 2.2927402 2.7014636 1.8761819 3.0373211 3.7757393 2.4186387
[57] 2.2083181 1.8408708 1.8974024 2.2706809 1.8363325 2.9440807 3.5039214
[64] 2.9709027 2.9985723 2.5565090 2.9623900 3.0886047 1.1862740 2.3661102
[71] 2.8481683 2.3417242 2.3812090 2.8300997 3.6647020 3.7667767 2.9537752
[78] 3.3326860 3.4122738 3.5066055 2.5631694 3.3154215 3.3868864 3.4333433
[85] 2.6311250 3.8831043 2.9633408 2.7529126 3.5268570 2.6311250 3.8425646
[92] 3.3292661 3.5431668 4.2558233 3.5986836 3.5807842 4.0047537
plot(datos.prostata$lcavol, datos.prostata$lpsa, col = "black",
main = "Recta de mejor ajuste para las variables lpsa y lcavol",
xlab = "log(volumen de cáncer)",
ylab = "log(antígeno prostático específico)",
pch = 19)
grid(6, 6, lwd = 2)
abline(modelo, col = "red")
nuevos.datos <- data.frame(lcavol=c(0.70, 0.41, -0.23))
nuevos.datos
prediccion <- predict(modelo, nuevos.datos , interval = "confidence")
prediccion
fit lwr upr
1 2.010822 1.829324 2.192319
2 1.802219 1.598768 2.005670
3 1.341854 1.075486 1.608221
plot(x = nuevos.datos$lcavol, y = prediccion[,1]
, col = "black",
main = "Predicción de valores de lpsa dados nuevos valores de lcavol",
xlab = "log(volumen de cáncer)",
ylab = "log(antígeno prostático específico)",
pch = 19)
grid(6, 6, lwd = 2)
# Simple linear regression of lpsa on age.
# NOTE(review): this reassigns `modelo`, discarding the previous
# lcavol-based fit used above.
modelo <- lm(lpsa ~ age, data = datos.prostata)
modelo
Call:
lm(formula = lpsa ~ age, data = datos.prostata)
Coefficients:
(Intercept) age
0.79906 0.02629
# Extract the fitted coefficients (intercept and slope) as a named vector.
coef(modelo)
(Intercept) age
0.79906018 0.02629454
# Scatter plot of age vs lpsa with the fitted line overlaid.
plot(x = datos.prostata$age,
y = datos.prostata$lpsa,
col = "black",
main = "Recta de mejor ajuste para las variables lpsa y age",
xlab = "Edad en años",
ylab = "log(antígeno prostático específico)",
pch = 19)
grid(6, 6, lwd = 2)
abline(modelo,col="red")
# Predict lpsa (with confidence intervals) for four new ages.
nuevos.datos <- data.frame(age=c(42, 55, 71, 77))
prediccion <- predict(modelo,nuevos.datos,interval = "confidence")
prediccion
fit lwr upr
1 1.903431 1.184926 2.621936
2 2.245260 1.885712 2.604808
3 2.665973 2.345921 2.986024
4 2.823740 2.354460 3.293020
# Plot the point predictions ("fit" column) against the new ages.
plot(x = nuevos.datos$age, y = prediccion[,1]
, col = "black",
main = "Predicción de valores de lpsa dados nuevos valores de age",
xlab = "Edad en años",
ylab = "log(antígeno prostático específico)",
pch = 19)
grid(6, 6, lwd = 2)
Para estos ejemplos utilizaremos nuevamente los datos de cáncer de próstata ya presentados en el ejemplo anterior.
# --- Multiple linear regression on the prostate cancer data set ---
# NOTE(review): setwd() with an absolute path is fragile in shared code;
# prefer relative paths. Kept as-is since this is a console transcript.
setwd("~/Desktop/Datos")
# The CSV uses ';' as field separator and ',' as decimal mark.
datos.prostata <- read.csv("prostate_data.csv", header=TRUE, dec=",", sep=";")
dim(datos.prostata)
[1] 97 9
datos.prostata
# A second table with 10 new observations, used later for prediction/testing.
datos.prostata.nuevos <- read.csv("newprostate_data.csv", header=TRUE, dec=",", sep=";")
dim(datos.prostata.nuevos)
[1] 10 9
datos.prostata.nuevos
# Fit lpsa against all other columns ('.' = every remaining variable).
modelo <- lm(lpsa ~ ., data = datos.prostata)
modelo
Call:
lm(formula = lpsa ~ ., data = datos.prostata)
Coefficients:
(Intercept) lcavol lweight age lbph svi
0.181561 0.564341 0.622020 -0.021248 0.096713 0.761673
lcp gleason pgg45
-0.106051 0.049228 0.004458
# Full summary: coefficient tests, residual spread, R² and overall F-test.
summary(modelo)
Call:
lm(formula = lpsa ~ ., data = datos.prostata)
Residuals:
Min 1Q Median 3Q Max
-1.76644 -0.35510 -0.00328 0.38087 1.55770
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.181561 1.320568 0.137 0.89096
lcavol 0.564341 0.087833 6.425 6.55e-09 ***
lweight 0.622020 0.200897 3.096 0.00263 **
age -0.021248 0.011084 -1.917 0.05848 .
lbph 0.096713 0.057913 1.670 0.09848 .
svi 0.761673 0.241176 3.158 0.00218 **
lcp -0.106051 0.089868 -1.180 0.24115
gleason 0.049228 0.155341 0.317 0.75207
pgg45 0.004458 0.004365 1.021 0.31000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6995 on 88 degrees of freedom
Multiple R-squared: 0.6634, Adjusted R-squared: 0.6328
F-statistic: 21.68 on 8 and 88 DF, p-value: < 2.2e-16
# NOTE(review): the two lines below re-read the same file already loaded
# above — redundant, with no effect on the data.
setwd("~/Desktop/Datos")
datos.prostata <- read.csv("prostate_data.csv", header=TRUE, dec=",", sep=";")
# Train/test split: 70% for training, 30% for testing.
# Fix(review): the original sampled 30% of the rows into `muestra` and then
# used `muestra` for the TRAINING set, producing a 30/70 split that
# contradicted the stated 70/30. Here the sampled 30% becomes the test set.
# set.seed() makes the random split reproducible across runs (the original
# had none, so its pasted results cannot be reproduced).
set.seed(123)
numero.predictoras <- dim(datos.prostata)[2] - 1  # every column except lpsa
numero.filas <- dim(datos.prostata)[1]
# Sample 30% of the row indices for the test set; the remaining 70%
# (selected via negative indexing) forms the training set.
muestra <- sample(seq_len(numero.filas), numero.filas * 0.3)
datos.prostata.aprendizaje <- datos.prostata[-muestra, ]
datos.prostata.prueba <- datos.prostata[muestra, ]
datos.prostata.aprendizaje
datos.prostata.prueba
# Fit the model using only the training data
modelo.lm <- lm(lpsa~., data = datos.prostata.aprendizaje)
modelo.lm
Call:
lm(formula = lpsa ~ ., data = datos.prostata.aprendizaje)
Coefficients:
(Intercept) lcavol lweight age lbph svi
0.8750968 0.4166023 0.3497793 -0.0508304 0.2269494 1.5309739
lcp gleason pgg45
0.1773820 0.3734952 0.0005517
# Make the prediction on the held-out test rows
prediccion <- predict(modelo.lm, datos.prostata.prueba)
# Accuracy measurement.
# NOTE(review): indices.precision() is a helper defined elsewhere (not in
# this file); judging by the printed fields it returns squared error, a
# root-squared-error, relative error and observed/predicted correlation —
# confirm against its definition.
pre.lm <- indices.precision(datos.prostata.prueba$lpsa, prediccion,numero.predictoras)
pre.lm
$error.cuadratico
[1] 1.137701
$raiz.error.cuadratico
[1] 1.145097
$error.relativo
[1] 0.3297569
$correlacion
[1] 0.6730252
# Plot actual vs predicted values, with a linear best-fit line.
# NOTE(review): plot.real.prediccion() is also an external helper
# (ggplot2-based, given the geom_smooth() call below).
g <- plot.real.prediccion(datos.prostata.prueba$lpsa, prediccion, modelo = "Regresión Lineal")
g + geom_smooth(method = lm, size = 0.4, color = "red", se = FALSE)
`geom_smooth()` using formula = 'y ~ x'
# Confidence-interval predictions from the full model (lpsa ~ .) for the
# 10 new observations.
# NOTE(review): `modelo` here is the full-data fit from earlier, not
# `modelo.lm` trained on the 70% split — confirm that is intentional.
prediccion <- predict(modelo,datos.prostata.nuevos, interval="confidence")
prediccion
fit lwr upr
1 2.0738156 1.6676943 2.479937
2 2.4074603 1.9332538 2.881667
3 0.7555028 0.2847731 1.226232
4 2.0666468 1.4775007 2.655793
5 1.4300652 0.6443919 2.215738
6 1.6846533 1.3283367 2.040970
7 3.5046107 3.0312808 3.977941
8 2.5671972 2.0767038 3.057691
9 4.4720592 3.8658599 5.078258
10 4.1548343 3.6786098 4.631059
# Reduced model with three predictors: lcavol, lweight and age
# (reassigns `modelo`, replacing the full-model fit).
modelo <- lm(lpsa ~ lcavol + lweight + age, data = datos.prostata)
modelo
Call:
lm(formula = lpsa ~ lcavol + lweight + age, data = datos.prostata)
Coefficients:
(Intercept) lcavol lweight age
-0.30084 0.66191 0.72849 -0.01187
# Confidence-interval predictions for the same 10 new observations,
# now from the reduced model.
prediccion <- predict(modelo, datos.prostata.nuevos, interval = "confidence")
prediccion
fit lwr upr
1 2.1319974 1.8171223 2.446872
2 2.6319604 2.4543466 2.809574
3 0.5687773 0.1292658 1.008289
4 2.1260295 1.8442754 2.407784
5 0.9913164 0.4357991 1.546834
6 1.8741506 1.6541980 2.094103
7 3.7620126 3.4022130 4.121812
8 2.8544516 2.4309001 3.278003
9 4.3463853 3.7516738 4.941097
10 4.1521519 3.8288800 4.475424
modelo <- lm(lpsa~lcavol+svi,data=datos.prostata)
# Usando newprostate_data.csv para testing
pred<-predict(modelo,datos.prostata.nuevos)
# Se usa NumVar=2 porque tenemos 2 predictores
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,2)
pre.lm
$error.cuadratico
[1] 0.935128
$raiz.error.cuadratico
[1] 1.15581
$error.relativo
[1] 0.267384
$correlacion
[1] 0.8068148
# Full model (all predictors), fit on the full data set.
modelo<-lm(lpsa~.,data=datos.prostata)
# Using newprostate_data.csv for testing
pred<-predict(modelo,datos.prostata.nuevos)
# dim(PDatos)[2]-1 = number of predictors, which is what the function takes
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,dim(datos.prostata.nuevos)[2]-1)
pre.lm
$error.cuadratico
[1] 1.04659
# NOTE(review): 3.235 is not sqrt(1.04659) ≈ 1.023. Across all four
# precision printouts the values match sqrt(SSE / (n - k - 1)); here
# n = 10 rows and k = 8 predictors give denominator 1, inflating the
# value — confirm against the indices.precision() definition.
$raiz.error.cuadratico
[1] 3.235104
$error.relativo
[1] 0.2930132
$correlacion
[1] 0.7931361
# With 1 predictor variable
modelo<-lm(lpsa~lcavol,data=datos.prostata)
# With the testing table
pred<-predict(modelo,datos.prostata.nuevos)
# NumVar = 1 because the model has 1 predictor
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,1)
pre.lm
$error.cuadratico
[1] 1.229151
$raiz.error.cuadratico
[1] 1.239532
$error.relativo
[1] 0.3058011
$correlacion
[1] 0.711853
# With 3 predictor variables
modelo<-lm(lpsa~lcavol+svi+lweight,data=datos.prostata)
# With the testing table
pred<-predict(modelo,datos.prostata.nuevos)
# NumVar = 3 because the model has 3 predictors
pre.lm <- indices.precision(datos.prostata.nuevos$lpsa,pred,3)
pre.lm
$error.cuadratico
[1] 0.9961928
$raiz.error.cuadratico
[1] 1.288535
$error.relativo
[1] 0.2934903
$correlacion
[1] 0.8063431